# Calculate/load Differential Expression metrics for all genes, load SFARI dataset:

# Gene expression data
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')

# SFARI genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
SFARI_genes = SFARI_genes %>% inner_join(datProbes, by=c('gene-symbol'='external_gene_id')) %>%
              mutate('ID' = ensembl_gene_id) %>%
              dplyr::select(ID, `gene-score`, syndromic)

# Balance Groups by covariates, remove singular batches (none)
to_keep = (datMeta$Subject_ID != 'AN03345') & !is.na(datMeta$Dx)
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
  
if(!file.exists('./../working_data/genes_ASD_DE_info.csv')) {
  
  # Calculate differential expression for ASD
  mod = model.matrix(~ Dx, data=datMeta)
  corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
  lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
  
  fit = eBayes(lmfit, trend=T, robust=T)
  top_genes = topTable(fit, coef=2, number=nrow(datExpr))
  genes_DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),] %>%
                  mutate('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID')
  
  write_csv(genes_DE_info, path='./../working_data/genes_ASD_DE_info.csv')
  
  rm(mod, corfit, lmfit, fit, top_genes)
} else {
genes_DE_info = read_csv('./../working_data/genes_ASD_DE_info.csv')
}

rm(to_keep, datSeq, datProbes)

Scatter plot of genes coloured by score

Genes with an autism score don’t seem to have better behaviour than the rest of the genes:

genes_DE_info = genes_DE_info %>% mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))

# Create subsample of genes without score so plot is not that heavy
genes_DE_info_score = genes_DE_info %>% filter(`gene-score`!='None')
genes_DE_info_sample = genes_DE_info %>% filter(`gene-score`=='None') %>% 
  sample_frac(0.5) %>% bind_rows(genes_DE_info_score)

ggp = ggplotly(genes_DE_info_sample %>% ggplot(aes(adj.P.Val, logFC, color=`gene-score`)) + 
               geom_point(aes(ids=ID, alpha=`gene-score`)) + scale_alpha_discrete(range=c(1, 0.2)) + 
               geom_vline(aes(xintercept=0.03), color='#666666') + scale_color_manual(values=gg_colour_hue(7)) + 
               theme_minimal())
rangeslider(ggp, start=0, end=1.05)
rm(genes_DE_info_score, genes_DE_info_sample, ggp)

Boxplots of the log2 fold change by score

ggplotly(genes_DE_info %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() + 
                     scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Perhaps there was an error in the calculation of the differential expression?

Comparing expression density between top and least DE genes

Looks OK…

top_gene_id = genes_DE_info$ID[which.max(abs(genes_DE_info$logFC))]
bottom_gene_id = genes_DE_info$ID[which.min(abs(genes_DE_info$logFC))]
top_bottom_gene_expr = data.frame('Expr_top'=datExpr[rownames(datExpr)==top_gene_id,], 
                                  'Expr_bottom'=datExpr[rownames(datExpr)==bottom_gene_id,],
                                  'Diagnosis'=datMeta$Diagnosis)
p1 = top_bottom_gene_expr %>% ggplot(aes(Expr_bottom, color=Diagnosis, fill=Diagnosis)) + 
                                     geom_density(alpha=0.4) + theme_minimal()
p2 = top_bottom_gene_expr %>% ggplot(aes(Expr_top, color=Diagnosis, fill=Diagnosis)) + 
                                     geom_density(alpha=0.4) + theme_minimal()

grid.arrange(p1, p2, ncol=2)

rm(top_gene_id, bottom_gene_id, top_bottom_gene_expr, p1, p2)

Comparing expression levels between different scores

Boxplots of the average difference in mean between diagnosis by score

Same as with the boxplots for log fold change

datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))

ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
                        'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                        'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>% 
             mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
             left_join(genes_DE_info, by='ID') %>% 
             dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`)

ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + geom_boxplot() +
               scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal()

# ASD_vs_CTL %>% ggplot(aes(`gene-score`, sd_diff, fill=`gene-score`)) + geom_boxplot() +
#               scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal()

rm(datExpr_ASD, datExpr_CTL)

Density plot of the average difference in mean between diagnosis by score

Score 1 outlier has ID = ENSG00000136535 (TBR1), it’s the same one that almost made the cut in the scatter plot

ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(mean_diff), color=`gene-score`, fill=`gene-score`)) + 
                        geom_density(alpha=0.3) + theme_minimal() + 
                        scale_fill_manual(values=gg_colour_hue(7)) + 
                        scale_color_manual(values=gg_colour_hue(7)))

Density plot of the average difference in standard deviation between diagnosis by score

Score 1 outlier is the same one as in the previous plot

ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(sd_diff), color=`gene-score`, fill=`gene-score`)) +
                        geom_density(alpha=0.3) + theme_minimal() +
                        scale_fill_manual(values=gg_colour_hue(7)) +
                        scale_color_manual(values=gg_colour_hue(7)))

Perhaps the error was when performing normalisation?

Comparing expression levels between different scores using raw data and matching IDs to SFARI’s using biomaRt

Loading raw data, modifying metadata and SFARI genes manually and filtering genes

# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')

# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
all(colnames(datExpr) == rownames(datMeta))
## [1] TRUE
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position

#################################################################################
# FILTERS:

# 1 Filter probes with start or end position missing (filter 5)
# They can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id

# 3. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

#################################################################################
# Annotate SFARI genes

# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% mutate('gene-symbol'=hgnc_symbol) %>% 
                   dplyr::select('ensembl_gene_id', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')

datExpr_backup = datExpr

rm(getinfo, to_keep, gene_names, mart)

Boxplot of difference in mean between diagnosis by score

  • Score 1 has the highest median, all scores have higher median than the rest of the genes

  • Many outliers (have to use zoom to see the boxes)

datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))

ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
                        'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                        'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
             mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
             left_join(SFARI_genes, by=c('ID'='ensembl_gene_id')) %>%
             dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
             mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Perform Quantile Normalisation

datExpr = datExpr_backup

# Check datProbes and datExpr have same rows
all(rownames(datExpr) == rownames(datProbes))
## [1] TRUE
# Conditional Quantile Normalisation (CQN)
cqn.dat = cqn(datExpr, lengths = datProbes$length,
              x = datProbes$percentage_gc_content,
              lengthMethod = 'smooth',
              sqn = FALSE)  # Run cqn with specified depths and with no quantile normalisation
datExpr = cqn.dat$y + cqn.dat$offset  # Get the log2(Normalised RPKM) values

# Filter out genes with low counts (filter 43406)
pres = apply(datExpr>1, 1, sum) 
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]

Boxplot of difference in mean between diagnosis by score

Scores 1 and 2 have the lowest median!!!

datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))

ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
                        'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                        'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
             mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
             left_join(SFARI_genes, by=c('ID'='ensembl_gene_id')) %>%
             dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
             mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Perform Quantile Normalisation with sqn=TRUE

The cqn documentation says this TRUE default should only be changed by expert users, perhaps they weren’t expert users…

datExpr = datExpr_backup

# Check datProbes and datExpr have same rows
all(rownames(datExpr) == rownames(datProbes))
## [1] TRUE
# Conditional Quantile Normalisation (CQN)
cqn.dat = cqn(datExpr, lengths=datProbes$length ,x=datProbes$percentage_gc_content)
## Using 'sigma' instead 'sig2' (= sigma^2) is preferred now
datExpr = cqn.dat$y + cqn.dat$offset

# Filter out genes with low counts (filter 43601)
pres = apply(datExpr>1, 1, sum) 
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]

Boxplot of difference in mean between diagnosis by score

Doesn’t seem to make much of a difference, if anything, it makes it worse

datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))

ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
                        'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                        'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
             mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
             left_join(SFARI_genes, by=c('ID'='ensembl_gene_id')) %>%
             dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
             mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Other Normalisation techniques?

Using DESeq2, with the normalized option in counts and applying log2 transformation manually

datExpr = datExpr_backup

counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
                    IRanges(datProbes$start_position, width=datProbes$length),
                    strand=datProbes$strand,
                    feature_id=datProbes$ensembl_gene_id)

se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_+Region)

#Estimate size factors
dds = estimateSizeFactors(dds)

#Plot column sums according to size factor
plot(sizeFactors(dds), colSums(counts(dds)))
abline(lm(colSums(counts(dds)) ~ sizeFactors(dds) + 0))

datExpr = log2(counts(dds, normalized=TRUE) + 1)

# Filter out genes with low counts (filter 35303)
pres = apply(datExpr>1, 1, sum) 
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]

Boxplot of difference in mean between diagnosis by score

Same result as before …

datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))

ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
                        'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                        'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
             mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
             left_join(SFARI_genes, by=c('ID'='ensembl_gene_id')) %>%
             dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
             mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Using vst (variance stabilisation normalisation) from DESeq2

vst_output = vst(dds)
datExpr = assay(vst_output)

# Filter out genes with low counts (filter 0)
pres = apply(datExpr>1, 1, sum)
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]

Boxplot of difference in mean between diagnosis by score

Same result as before …

datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))

ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
                        'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                        'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
             mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
             left_join(SFARI_genes, by=c('ID'='ensembl_gene_id')) %>%
             dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
             mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())